{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getFile(dir):\n",
    "    files=os.listdir(dir)\n",
    "    imags=[]\n",
    "    labels=[]\n",
    "    for file_name in files:\n",
    "        \n",
    "        imags.append(''.join(np.loadtxt(f'{dir}/{file_name}',dtype=str)))\n",
    "        labels.append(file_name.split('_')[0])\n",
    "    return pd.DataFrame({\n",
    "        \"imags\":imags,\n",
    "        \"labels\":labels\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getFiles(dirs):\n",
    "    return list(map(getFile,dirs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "[test,train]=getFiles([\"digits/testDigits\",\"digits/trainingDigits\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "imags     0000000000000100000000000000000000000000001111...\n",
       "labels                                                    3\n",
       "Name: 300, dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.iloc[300]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def hamn(A,B):\n",
    "    return sum([a!=b for(a,b) in zip(A,B)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       0000000000000111100000000000000000000000000011...\n",
       "1       0000000000011111000000000000000000000000001111...\n",
       "2       0000000000000011000000000000000000000000000001...\n",
       "3       0000000000001111000000000000000000000000000111...\n",
       "4       0000000000000011111000000000000000000000000011...\n",
       "                              ...                        \n",
       "1929    0000000000000000000100000000000000000000000000...\n",
       "1930    0000000000000000000000010000000000000000000000...\n",
       "1931    0000000000000000000000010000000000000000000000...\n",
       "1932    0000000000000011111100000000000000000000000111...\n",
       "1933    0000000000000011111100000000000000000000000001...\n",
       "Name: imags, Length: 1934, dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.iloc[:,0].apply(lambda x:x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>imags</th>\n",
       "      <th>labels</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000000000000111100000000000000000000000000011...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0000000000011111000000000000000000000000001111...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0000000000000011000000000000000000000000000001...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000000000001111000000000000000000000000000111...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0000000000000011111000000000000000000000000011...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1929</th>\n",
       "      <td>0000000000000000000100000000000000000000000000...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1930</th>\n",
       "      <td>0000000000000000000000010000000000000000000000...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1931</th>\n",
       "      <td>0000000000000000000000010000000000000000000000...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1932</th>\n",
       "      <td>0000000000000011111100000000000000000000000111...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1933</th>\n",
       "      <td>0000000000000011111100000000000000000000000001...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1934 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  imags labels\n",
       "0     0000000000000111100000000000000000000000000011...      0\n",
       "1     0000000000011111000000000000000000000000001111...      0\n",
       "2     0000000000000011000000000000000000000000000001...      0\n",
       "3     0000000000001111000000000000000000000000000111...      0\n",
       "4     0000000000000011111000000000000000000000000011...      0\n",
       "...                                                 ...    ...\n",
       "1929  0000000000000000000100000000000000000000000000...      9\n",
       "1930  0000000000000000000000010000000000000000000000...      9\n",
       "1931  0000000000000000000000010000000000000000000000...      9\n",
       "1932  0000000000000011111100000000000000000000000111...      9\n",
       "1933  0000000000000011111100000000000000000000000001...      9\n",
       "\n",
       "[1934 rows x 2 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "A0=\"00000000000011111100000000000000000000000001111111110000000000000000000001111111111100000000000000000001111111111111100000000000000000001111111000111100000000000000000111111000000111000000000000000001111110000001110000000000000000111111000000011100000000000000001111110000000111100000000000000011111100000000011100000000000000111111000000000111000000000000001111100000000000111000000000000011111000000000001110000000000000011111000000000001110000000000000111110000000000011100000000000001111100000000000111000000000000111110000000000001110000000000001111100000000000011100000000000000111100000000000011100000000000001111000000000001111000000000000011110000000000011110000000000000111100000000001111100000000000000111100000000001111100000000000001111100000000111110000000000000011111000000011111100000000000000111110000001111110000000000000001111110001111111100000000000000000111111111111110000000000000000001111111111111000000000000000000001111111111000000000000000000000000111110000000000000\"\n",
    "B0=\"0000000000000001100000000000000000000000000111111110000000000000000000000011111111111000000000000000000000111111111111000000000000000000011111111111111000000000000000000111111000111110000000000000000011111000000111100000000000000000111110000001111100000000000000001111100000001111100000000000000111111000000011111000000000000001111110000000011111000000000000011111100000000011110000000000000111111000000000111110000000000001111100000000001111000000000000011110000000000011110000000000000111100000000000111100000000000001111000000000000111000000000000001111000000000001110000000000000011110000000000011100000000000000111100000000000111000000000000011110000000000111100000000000000111100000000001111000000000000000111000000000011110000000000000001111100000111111100000000000000011111000111111110000000000000000111111111111111000000000000000000111111111111110000000000000000011111111111110000000000000000000011111111111000000000000000000000011111110000000000000000000000000111110000000000000000000000000000100000000000000000000\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "训练算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def knn(inX,df,k):\n",
    "    #计算输入inX和df集合每一个样本的距离\n",
    "    dist=df.iloc[:,0].apply(lambda x:hamn(inX,x))\n",
    "    dist_l=pd.DataFrame({\n",
    "        'dist':dist,\n",
    "        'label':df.iloc[:,-1]\n",
    "\n",
    "    })\n",
    "    dist_k = (dist_l.sort_values(by='dist').iloc[:k])\n",
    "    return dist_k.value_counts('label').index[0]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "knn(B0,test,3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "imags     0000000000000110000000000000000000000000000011...\n",
       "labels                                                    0\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.iloc[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "测试算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def digitsTest(test,train,k):\n",
    "    predict = []\n",
    "    for _, row in test.iterrows():\n",
    "        #row代表样本，第一列是img,第二列是label\n",
    "        predict.append(knn(row[0],train,k))\n",
    "    return np.mean(predict == test.iloc[:,-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9894291754756871"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "digitsTest(test,train,3)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a6dc62afd8b03c17538a9dfce2fcb18f62cec380cc7b77050462a64b7e4e4814"
  },
  "kernelspec": {
   "display_name": "Python 3.8.0 32-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
